**06/25/2021
**file to bring in CPD FOIA unit history files received in 2021 & assign unique IDs
 

**Housekeepig
local ver_suff _9_21
clear all
local ver_suff _9_21
clear all
if "`c(username)'"=="taehokim" |"`c(username)'"=="Taeho Kim"{ 
	cd "/Users/`c(username)'/Dropbox (Penn Law)/misconduct/do_files`ver_suff'"
} 
else{
	cd "/Users/`c(username)'/Dropbox/misconduct/do_files`ver_suff'"
}
set more off


//later pass through profiles dataset to fuzzy match 
local profiles_match `1'

*******************
*BRING IN UNIT HISTORY DATA
*******************

//run the following to bring in the raw file
import excel using ../FOIA/FOIA_units_051821/P661041_CPD-Sworn-Employees-Assignment-Histories_UpTo_2021.xlsx, ///
           sheet(Assignment_History) firstrow clear 

**Record name
gen fname = trim(FIRST_NME)
gen lname = trim(LAST_NME)
gen mi = trim(MIDDLE_INITIAL)

drop LAST_NME FIRST_NME MIDDLE_INITIAL

**Standardize names of some variables
rename GENDER sex
rename RACE race
rename YEAR birthyear 
rename APPOINTED_DATE appoint_dt 


**Propogate appointment date across officers with the same names and birthyears
gsort fname lname mi birthyear -appoint_dt
replace appoint_dt = appoint_dt[_n-1] if fname == fname[_n-1] & lname == lname[_n-1] &   birthyear ==birthyear[_n-1]

**Record appointment date
gen appoint_yr = year(appoint_dt)
gen appoint_m = month(appoint_dt)
gen appoint_d = day(appoint_dt)		   

//some obs have variations of last names: deflate last names
gen lname_mod = subinstr(lname, "-", "", .)
replace lname_mod = subinstr(lname_mod, " ", "", .)
replace lname_mod = subinstr(lname_mod, ".", "", .)

**Save intermediate dataset
save ../processed_data`ver_suff'/FOIA_processing/unithistory, replace
  

use ../processed_data`ver_suff'/FOIA_processing/unithistory, clear
 
**Deduplicate with respect to matching variables
duplicates drop fname lname_mod mi appoint_yr appoint_m appoint_d birthyear , force
keep fname lname lname_mod mi appoint_yr appoint_m appoint_d birthyear sex race


sort fname lname_mod mi appoint_yr appoint_m appoint_d birthyear 

**Drop if missing name information
drop if lname == ""

**Generate preliminary IDs
gen pid = _n 

**Generate soundexes for names
gen fname_sdx = soundex(fname)
gen lname_sdx = soundex(lname)
 
**Save dataset with preliminary IDs
save ../processed_data`ver_suff'/FOIA_processing/unit_pid, replace
clear


*******************
*SET UP LIST OF DATA TO STORE MATCHED IDS
*******************
clear 

set obs 1
gen u_pid = -1
gen pid = -1

save ../processed_data`ver_suff'/FOIA_processing/pid_appending, replace
clear


*******************
*FUZZY MATCHING
*******************

use ../processed_data`ver_suff'/FOIA_processing/unit_pid, clear // start with 36,130

**Perfect name, appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 1 //27,746 so far

**Perfect name, appoint date match, 1 year birth year gap
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 1 2 2 2 1 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 2  

**Perfect name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 3  

**Perfect first name, soundex last name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 4  

**Soundex both names, middle initial, and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 5  

**Soundex both names, no middle initial, and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 6  

**first name, no middle initial, no last name and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 0 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 7  

 

**first name, last name, off in appoint_yr 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 1 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 9  


**first name, last name, off in appoint_mo 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 1 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 10  


**first name, last name, off in appoint_day 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 1 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 11  


**first name, last name, off in birth year 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 1 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 12  


**first name, last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 13  
 
**soundex first name, soundex last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 14  

  
 
*******************
*KEEP BEST MATCHES
******************* 

use ../processed_data`ver_suff'/FOIA_processing/pid_appending.dta, clear

//drop the less good matches if individuals in awards data are matched with more than one in profiles

sort pid match_no u_pid
quietly by pid:  gen dup = cond(_N==1,0,_n)
sort dup
tab dup

drop if dup>1 
drop dup 
sort pid match_no

tempfile best_matches
save `best_matches'

*******************
*MERGE DE-DUPLICATED AWARDS DATA WITH MATCHINGS
******************* 

use ../processed_data`ver_suff'/FOIA_processing/unit_pid, clear

merge 1:1 pid using `best_matches', keep(1 3)  


 
